Statement Of Contribution:
Assignment 1: Dinesh Sundaramoorthy (dinsu875)
Assignment 2: Jin Yan (jinya425)
# Export the Mapbox access token as an environment variable when a token
# file is present in the working directory (presumably read by plot_mapbox()
# further down -- confirm).
if (file.exists(".mapbox_api_token")) {
  # Join all lines of the file into a single string.
  api_key <- paste0(readLines(".mapbox_api_token"), collapse = "")
  Sys.setenv(MAPBOX_TOKEN = api_key)
}

# Mosquito records used throughout part 1; read.csv() already defaults to a
# header row and a comma separator.
data <- read.csv("aegypti_albopictus.csv")
# 1.1
# Map of the records from 2004, drawn as one trace per VECTOR value.
data %>%
  # keep only the variables used for this map
  dplyr::select(VECTOR, Y, X, YEAR, COUNTRY) %>%
  # restrict to a single year
  filter(YEAR == 2004) %>%
  plot_mapbox(
    # split = ~VECTOR gives every vector its own trace, hence its own colour
    x = ~X, y = ~Y, split = ~VECTOR,
    # "scattermapbox" is a trace type, not a mode flag; "markers" is the
    # valid value and draws the records as points
    mode = "markers", hoverinfo = "name"
  ) %>%
  layout(
    title = "Mosquito Data Insights(2004)",
    mapbox = list(style = "light"),
    margin = list(r = 20, l = 20, b = 20, t = 20, pad = 0.5),
    legend = list(orientation = "h", font = list(size = 8))
  )
# 1.1.2
# Map of the records from 2013, drawn as one trace per VECTOR value.
data %>%
  # keep only the variables used for this map
  dplyr::select(VECTOR, Y, X, YEAR, COUNTRY) %>%
  # restrict to a single year
  filter(YEAR == 2013) %>%
  plot_mapbox(
    # split = ~VECTOR gives every vector its own trace, hence its own colour
    x = ~X, y = ~Y, split = ~VECTOR,
    # "scattermapbox" is a trace type, not a mode flag; "markers" is the
    # valid value and draws the records as points
    mode = "markers", hoverinfo = "name"
  ) %>%
  layout(
    title = "Mosquito Data Insights(2013)",
    mapbox = list(style = "light"),
    margin = list(r = 20, l = 20, b = 20, t = 20, pad = 0.5),
    legend = list(orientation = "h", font = list(size = 8))
  )
In 2004, Aedes albopictus was distributed mainly near the Tropic of Cancer, especially in Taiwan, but some were also spotted in Indonesia and the southern Pacific Ocean. Meanwhile, most Aedes aegypti were found in Mexico and South America, with some in Africa and southern Asia. In 2013, the number of Aedes albopictus observations had dropped sharply, and they were mainly located in Taiwan and northern Italy. Meanwhile, the number of Aedes aegypti observations had increased sharply; they were concentrated in South America, especially in Brazil, where a very large number were recorded. In the second figure there are so many points that they merge together and cannot be told apart clearly, but the overall distribution can still be seen roughly.
The map conveys only a certain amount of information: Brazil has 8501 samples and Taiwan has nearly 25k, far more than any other country, so all other countries end up grey on the map compared to Brazil and Taiwan. We can barely see the differences between the numbers of mosquitoes in those countries.
# 1.2
# Geo layout shared by the world maps: selects the map projection.
g <- list(
  projection = list(type = "equirectangular")
)

# World choropleth of the number of records per country.
data %>%
  # One row per country (instead of repeating the count on every record);
  # COUNTRY_ID is kept because it is what the map uses as location key.
  group_by(COUNTRY, COUNTRY_ID) %>%
  summarise(Z = n(), .groups = "drop") %>%
  plot_geo() %>%
  add_trace(
    # The palette belongs in `colors`; passing `color` twice left it unused.
    z = ~Z, name = "Mosquito (Z)", color = ~Z, colors = "Reds",
    locations = ~COUNTRY_ID
  ) %>%
  layout(
    title = "Geographic Distribution of Mosquitoes by Country",
    geo = g
  )
# 1.3
# Same choropleth on a log scale, using the projection stored in `g`.
data %>%
  # One row per country (instead of repeating the value on every record);
  # COUNTRY_ID is kept because it is what the map uses as location key.
  group_by(COUNTRY, COUNTRY_ID) %>%
  summarise(Z = log(n()), .groups = "drop") %>%
  plot_geo() %>%
  add_trace(
    # The palette belongs in `colors`; passing `color` twice left it unused.
    z = ~Z, name = "Mosquito (Z)", color = ~Z, colors = "Reds",
    locations = ~COUNTRY_ID
  ) %>%
  layout(
    title = "Geographic Distribution of Mosquitoes by Country(A)",
    geo = g
  )
# 1.3.2
# Log-scale choropleth again, this time with a conic equal-area projection.
data %>%
  # One row per country (instead of repeating the value on every record);
  # COUNTRY_ID is kept because it is what the map uses as location key.
  group_by(COUNTRY, COUNTRY_ID) %>%
  summarise(Z = log(n()), .groups = "drop") %>%
  plot_geo() %>%
  add_trace(
    # The palette belongs in `colors`; passing `color` twice left it unused.
    z = ~Z, name = "Mosquito (Z)", color = ~Z, colors = "Reds",
    locations = ~COUNTRY_ID
  ) %>%
  layout(
    title = "Geographic Distribution of Mosquitoes by Country(B)",
    geo = list(
      projection = list(type = "conic equal area")
    )
  )
Normalizing the counts (here by taking the logarithm) makes it easy to see the differences in the number of mosquitoes between countries by color, regardless of how large the difference between one country and the others is.
In fig. 3, the equirectangular projection (which, like the Mercator projection, is a cylindrical projection) stretches regions at high latitudes relative to those at low latitudes, so Greenland appears to be about the size of Australia; it therefore suits countries at low latitudes, up to around the Tropic of Cancer. In contrast, in fig. 4, the conic equal-area projection (i.e. the Albers projection) preserves areas but distorts shapes away from its standard parallels, so it is only suitable for mid-latitude countries with a large east-west extent.
The areas most severely infested by mosquitoes are Natal, Rio de Janeiro, Sao Paulo and Maringa. Such discretization undoubtedly helps in analyzing the distribution of mosquitoes, because the sample points no longer merge together and we can see the number of mosquitoes in each area from the color of the scatter points.
# 1.4
# Brazilian records from 2013, binned into 100 longitude intervals by 100
# latitude intervals; every occupied cell is drawn as one marker placed at
# the mean position of its records.
data %>%
  filter(YEAR == 2013, COUNTRY == "Brazil") %>%
  dplyr::select(VECTOR, Y, X, YEAR, COUNTRY) %>%
  mutate(
    X1 = cut_interval(X, 100),
    Y1 = cut_interval(Y, 100)
  ) %>%
  group_by(X1, Y1) %>%
  # N is the number of records that fall into the cell
  summarise(mean_x = mean(X), mean_y = mean(Y), N = n()) %>%
  plot_mapbox(mode = "scattermapbox", hoverinfo = "name") %>%
  # splitting on N gives each distinct count its own trace, hence its own
  # colour and legend entry
  add_markers(x = ~mean_x, y = ~mean_y, split = ~N) %>%
  layout(
    title = "Brazilian Mosquito Data(2013)",
    mapbox = list(style = "light"),
    margin = list(r = 20, l = 20, b = 20, t = 20, pad = 0.5),
    legend = list(orientation = "h", font = list(size = 8))
  )
## `summarise()` has grouped output by 'X1'. You can override using the `.groups`
## argument.
# 2.1
# Read the Swedish first-level administrative boundaries (GeoJSON) as a
# simple-features object.
geo_data <- st_read(dsn = "gadm41_SWE_1.json")
## Reading layer `gadm41_SWE_1' from data source
## `C:\Users\yj313\Desktop\LAB\Visualization\lab_visual\lab3\gadm41_SWE_1.json'
## using driver `GeoJSON'
## Simple feature collection with 21 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 10.9614 ymin: 55.3363 xmax: 24.1724 ymax: 69.059
## Geodetic CRS: WGS 84
# Draw the boundaries that were just read.
ggplot() +
  geom_sf(data = geo_data)

mydata <- read.csv("000006SW_20230916-191349.csv", encoding = "latin1")
library(tidyr)
library(dplyr)

# Reshape to one row per region with one column per age group.
# pivot_wider() already treats `region` as the identifying column, so no
# group_by() is needed; leaving it out avoids carrying a hidden grouping
# into the later steps that reuse this table.
mydata_pivot <- mydata %>%
  pivot_wider(names_from = age, values_from = X2016)
# NOTE(review): this positional rename assumes the age groups appear in the
# file in the order young, adult, senior -- confirm against the CSV.
colnames(mydata_pivot) <- c("region", "Young", "Adult", "Senior")
# 2.2
# Violin plot of income per age group, with an embedded box plot.
colnames(mydata) <- c("region", "age", "income")
plot_vio <- plot_ly(
  mydata,
  x = ~factor(age), y = ~income, split = ~factor(age),
  type = "violin",
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  box = list(visible = TRUE)
)
plot_vio
Analysis: As people grow older, their income increases compared to when they were young. When they become seniors, their income increases further compared to when they were adults, but the change is not as large as before.
# 2.3
# Interpolate the senior income over a regular (Young, Adult) grid and show
# it as a surface. with() limits the column lookup to this single call, so
# the columns never have to be attach()ed to the search path.
s <- with(mydata_pivot, interp(Young, Adult, Senior, duplicate = "mean"))
# interp() returns z[i, j] for the point (x[i], y[j]), whereas a plotly
# surface reads the rows of z along y and the columns along x; the matrix is
# therefore transposed so that the axes match the interpolated values.
plot_ly(x = ~s$x, y = ~s$y, z = ~t(s$z), type = "surface")
Analysis: In general, when the average incomes of young people and adults are high, the average income of seniors is also high. I think a linear regression could explain the relationship between the income of seniors and that of young people or adults.
# 2.4
# Working copy of the wide income table, plus the boundary file parsed as a
# plain nested list.
df <- mydata_pivot
json <- fromJSON(file = "gadm41_SWE_1.json")
# Look at the properties of one feature to see how a region is described:
print(json[["features"]][[2]][["properties"]])
## $GID_1
## [1] "SWE.2_1"
##
## $GID_0
## [1] "SWE"
##
## $COUNTRY
## [1] "Sweden"
##
## $NAME_1
## [1] "Dalarna"
##
## $VARNAME_1
## [1] "Dalecarlia|Kopparberg"
##
## $NL_NAME_1
## [1] "NA"
##
## $TYPE_1
## [1] "Län"
##
## $ENGTYPE_1
## [1] "County"
##
## $CC_1
## [1] "NA"
##
## $HASC_1
## [1] "SE.KO"
##
## $ISO_1
## [1] "NA"
# Clean the region labels so they can be matched against the NAME_1 property
# of the GeoJSON features: remove the word "county", remove all digits,
# coerce to a plain vector, and finally trim the surrounding whitespace.
region_new <- df$region %>%
  str_remove_all("county") %>%
  gsub("[[:digit:]]+", "", .) %>%
  as.vector()
df$region <- trimws(region_new)

# Geo layout for both maps: fit the view to the plotted locations and hide
# the base map.
g <- list(fitbounds = "locations", visible = FALSE)

# Choropleth of the Young column per region.
p <- df %>%
  plot_geo() %>%
  add_trace(
    type = "choropleth", geojson = json, locations = ~region,
    z = ~Young, featureidkey = "properties.NAME_1"
  ) %>%
  layout(geo = g) %>%
  layout(title = "Income of young in different regions in Sweden")
p

# Choropleth of the Adult column per region.
p_2 <- df %>%
  plot_geo() %>%
  add_trace(
    type = "choropleth", geojson = json, locations = ~region,
    z = ~Adult, featureidkey = "properties.NAME_1"
  ) %>%
  layout(geo = g) %>%
  layout(title = "Income of adults in different regions in Sweden")
p_2
Analysis: From the plots we can see that, for both the young and adults, the income levels in Stockholm and Halland are the highest. By comparison, the income level of adults living in the northern counties of Sweden is higher than in the southern counties.
# 2.5
# Add a red marker labelled "Linköping" at the given coordinates on top of
# the map of the Young column.
add_markers(
  p,
  x = 15.62565, y = 58.41109,
  text = "Linköping",
  color = I("red")
)